R Markdown

This is R Markdown file produced to evidence the spatial data analysis skills learned throughout CASA0005: Geographical Information System and Sience, specifically for part 3 of its assessment.

library(ggmap)
library(devtools)

#Data source: http://prod.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com/pp-monthly-update-new-version.csv
#This data contains the 2018 annual price paid data in the UK
setwd("D:/Xin/MSc/Modules/CASA0005_GISS/Assessment/Assessment Part 3/")
raw<-read.csv("pp-monthly-update-new-version.csv",header=FALSE,sep=",")
Londonclean$X<-NULL

#As the raw data does not come with header, they need to be added manually.
colnames(raw)<-c("id","price","transfer_time","postcode","prop_type","new_build","lease_type","primary_name","secondary_name","street","locality","town","district","county","ppd_type","record_status")
nrow(raw) #Contaisn 115219 rows

#Slice the 2018 price paid data in Greater London area
Londonraw<- subset(raw, county=="GREATER LONDON")
nrow(Londonraw) #Trimmed down to 12636 rows

#Write to csv
write.csv("LondonppNov2018_raw.csv")

Londonraw<- subset(Londonraw, record_status=="A")
nrow(Londonraw) #Trimmed down to 11556 rows
Londonraw<- subset(Londonraw, ppd_type=="A")
nrow(Londonraw) #Trimmed down to 8919 rows
Londonraw<- subset(Londonraw, postcode!="")
nrow(Londonraw) #Trimmed down to 8917 rows, as there are some missing postcodes

#Slice the useful columns
Londonraw<-Londonraw[c("price","postcode","prop_type","new_build","lease_type","town","district","county")]

#Write to csv
write.csv(Londonraw,"LondonppNov2018raw.csv")

#Set google map api for geocoding purposes
my_api="AIzaSyA5EUX3ZcX4u6b_OhogTM9FaWlxu4U5fJc"
register_google(key=my_api)
ggmap(get_googlemap())

#highlight this whole block and create this function to access the Google Places API
url <- function(address, return.call = "json", sensor = "false") {
  root <- "https://maps.google.com/maps/api/geocode/"
  u <- paste(root, return.call, "?address=", address, "&sensor=", sensor, "&key=", my_api, sep = "")
  return(URLencode(u))
}

library(tidyverse)
library(RCurl)
library(RJSONIO)
library(plyr)

#highlight this whole block and create this function to geocode some places just from a random list of treasure hunt locations
geoCode <- function(address,verbose=FALSE) {
  if(verbose) cat(address,"\n")
  u <- url(address)
  doc <- getURL(u)
  x <- fromJSON(doc,simplify = FALSE)
  if(x$status=="OK") {
    lat <- x$results[[1]]$geometry$location$lat
    lng <- x$results[[1]]$geometry$location$lng
    location_type <- x$results[[1]]$geometry$location_type
    formatted_address <- x$results[[1]]$formatted_address
    return(c(lat, lng, location_type, formatted_address))
    Sys.sleep(0.5)
  } else {
  return(c(NA,NA,NA, NA))
  }
}

#WARNING: The section below takes hours to run
#now use the geoCode() function (which calls the URL function) to geocode our list of places
#for loop to cycle through every treasure hunt location
i=1
for(i in 1:nrow(Londonraw)){
  # Every nine records, pause 3 seconds so that the API doesn't kick us off...
  if(i %% 9 == 0) Sys.sleep(3)
  #now create a temporary list of useful elements
  tempdf <- as.list(geoCode(Londonraw[i,4]))
  #and write these back into our dataframe
  Londonraw[i,17] <- tempdf[1]
  Londonraw[i,18] <- tempdf[2]
  Londonraw[i,19] <- tempdf[4]
  print(i)
}
#WARNING: The section above takes hours to run

# rename the columns
names(Londonraw) <- c("price","postcode","prop_type","new_build","lease_type","town","district","county","lat","lon","GoogleAddress")
head(Londonraw)

#Transfer to new dataframe
Londonclean<-Londonraw

#Remove space in names for convenience
Londonclean$town<-gsub(" ","_",Londonclean$town)
Londonclean$district<-gsub(" ","_",Londonclean$district)

#Convert price to £k units
Londonclean$price<-round(Londonclean$price/1000)

#Add new build indicator column
Londonclean$new_build_ind<-Londonclean$new_build
Londonclean$new_build_ind<-sub("Y",1,Londonclean$new_build_ind)
Londonclean$new_build_ind<-sub("N",0,Londonclean$new_build_ind)
Londonclean$new_build_ind<-as.numeric(Londonclean$new_build_ind)

#Add flat indicator column
Londonclean$flat_ind<-Londonclean$prop_type
Londonclean$flat_ind<-sub("F",1,Londonclean$flat_ind)
Londonclean$flat_ind<-sub("S",0,Londonclean$flat_ind)
Londonclean$flat_ind<-sub("T",0,Londonclean$flat_ind)
Londonclean$flat_ind<-sub("D",0,Londonclean$flat_ind)
Londonclean$flat_ind<-as.numeric(Londonclean$flat_ind)

#Add leasehold indicator column
Londonclean$lease_ind<-Londonclean$lease_type
Londonclean$lease_ind<-sub("L",1,Londonclean$lease_ind)
Londonclean$lease_ind<-sub("F",0,Londonclean$lease_ind)
Londonclean$lease_ind<-as.numeric(Londonclean$lease_ind)

#Save frequently!!!
write.csv(Londonclean,"LondonppNov2018clean.csv")
library(ggplot2)
library(devtools)
Londonclean<-read.csv("LondonppNov2018clean.csv",header=TRUE,sep=",")
Londonclean$X<-NULL

#Calculate the proportions of new build  
sum(Londonclean$new_build_ind)/nrow(Londonclean) #15.52092%
## [1] 0.1552092
#Calculate the proportions of flats 
sum(Londonclean$flat_ind)/nrow(Londonclean) #55.42%
## [1] 0.5542223
#Calculate the proportions of flats
sum(Londonclean$lease_ind)/nrow(Londonclean) #56.59%
## [1] 0.5658854
#Calculate the average price
London_avg_price=ave(Londonclean$price)[1]
London_avg_price #average calculated is appx £609k per property
## [1] 608.8424
library(plyr)

#Observe the proption by property types and district/Boroughs
#hist_proptype_borough<-ggplot(Londonclean %>% count(district, prop_type) %>%    # Group by district and property type, then count number in each group
#          mutate(pct=x/sum(x),               # Calculate percent within each region
#                 ypos = cumsum(x) - 0.5*x),  # Calculate label positions
#        aes(district, x, fill=prop_type)) +
#   geom_bar(stat="identity")+ theme(axis.text.x = element_text(angle = 60, hjust = 1))
#plot(hist_proptype_borough)

#observe average price by borough
pricebyBorough<-aggregate(Londonclean$price, by=list(borough=Londonclean$district), FUN=mean)
pricebyBorough$x<-round(pricebyBorough$x)
names(pricebyBorough)=c("borough","price_per_property")
bar_price_borough<-ggplot(data=pricebyBorough,aes(x=borough,y=price_per_property))+geom_bar(stat="identity")+theme(axis.text.x = element_text(angle = 60,hjust = 1))+geom_hline(yintercept=London_avg_price,linetype="dashed",col="red")
plot(bar_price_borough)

#Save result dataframe to csv
write.csv(Londonclean,"LondonppNov2018clean.csv")
library(ggplot2)
#Read in cleaned csv
setwd("D:/Xin/MSc/Modules/CASA0005_GISS/Assessment/Assessment Part 3/")
Londonclean<-read.csv("LondonppNov2018clean.csv",header=TRUE,sep=",")
Londonclean$X<-NULL
nrow(Londonclean) #Should have 8917
## [1] 8917
#Detect price outliers in a boxplot, we can observe outliers as data points that is located outside the whiskers of the boxplot, that is "outside 1.5 times the interquartile range above the upper quartile and bellow the lower quartile". Source: https://www.r-statistics.com/tag/boxplot-outlier/
#It can be observed most outliers lie on the higher tail of price data
bp<-boxplot(Londonclean$price)

#Return the five value summary of boxplot
bp$stats
##      [,1]
## [1,]   35
## [2,]  350
## [3,]  460
## [4,]  640
## [5,] 1075
## attr(,"class")
##         1 
## "integer"
#Return the number of outliers detected by boxplot
length(bp$out)
## [1] 713
#comparing the means in price with and without outliers
mean(Londonclean$price[!Londonclean$price %in% bp$out])
## [1] 475.8
mean(Londonclean$price)
## [1] 608.8424
#Clean a sf dataframe with the outliers removed
Londonclean_rmout<-Londonclean[!Londonclean$price %in% boxplot.stats(Londonclean$price)$out,]
nrow(Londonclean_rmout) #8,204 rows
## [1] 8204
#write a new .csv file to your working directory
write.csv(Londonclean_rmout, "LondonppNov2018cleanrmout.csv")
library(sf)
library(tmap)
setwd("D:/Xin/MSc/Modules/CASA0005_GISS/Assessment/Assessment Part 3")
Londonclean<-read.csv("LondonppNov2018cleanrmout.csv",header=TRUE,sep=",")
Londonclean$X<-NULL
nrow(Londonclean) #Should have 8204 rows
## [1] 8204
#Set crs
ukgrid<-"+init=epsg:27700"
latlon<-"+init=epsg:4326"

#Convert cleaned dataset from dataframe format into sf objects
Londonclean_sf<-st_as_sf(Londonclean,coords = c("lon","lat"),crs=latlon)

library(geojsonio)

start.time <- Sys.time()
######
EnglandMap<-geojson_read("http://geoportal.statistics.gov.uk/datasets/8edafbe3276d4b56aec60991cbddda50_2.geojson", what = "sp")

#Crop the London shapefile only
LondonMap <- EnglandMap[grep("^E09",EnglandMap@data$lad15cd),]
######
end.time <- Sys.time()
time.taken <- end.time - start.time
time.taken #1.38min to run
## Time difference of 33.49405 secs
library(spatstat)
library(sp)

#Convert the crs to ukgrid and set an observation window for spatstat to carry out KDE analysis
#LondonMap_latlon<-spTransform(LondonMap,latlon)
#summary(LondonMap_latlon)
#window<-as.owin(LondonMap_latlon)

#Create a point pattern object for the hunt locations
#Londonclean.ppp<-ppp(x=st_coordinates(Londonclean_sf)[,2],y=st_coordinates(Londonclean_sf)[,1],window=window)
#plot(Londonclean.ppp,pch=16,cex=1,main="November 2018 London property transactions")

#Plot the kernel density estimation map
#plot(density(Londonclean.ppp, sigma = 1000))

#Convert sp objects into simple features
LondonMap_sf<-st_as_sf(LondonMap)

#Convert the crs to EPSG:4326 and check
LondonMap_sf<-st_transform(LondonMap_sf,latlon)

#Read in London stations
library(nngeo)
stn <- st_read("https://www.doogal.co.uk/LondonStationsKML.ashx")
## Reading layer `London stations with zone information' from data source `https://www.doogal.co.uk/LondonStationsKML.ashx' using driver `KML'
## Simple feature collection with 641 features and 2 fields
## geometry type:  POINT
## dimension:      XYZ
## bbox:           xmin: -0.61143 ymin: 51.28216 xmax: 0.329952 ymax: 51.74702
## epsg (SRID):    4326
## proj4string:    +proj=longlat +datum=WGS84 +no_defs
stn_sf<-st_transform(stn,latlon)


#Identify the indices of the nearest tube station, the output is expressed in index of the corresponding station in stn_sf dataframe
Londonclean_sf$near<-st_nn(x=Londonclean_sf,y=stn_sf,k=1)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |                                                                 |   1%
  |                                                                       
  |=                                                                |   1%
  |                                                                       
  |=                                                                |   2%
  |                                                                       
  |==                                                               |   2%
  |                                                                       
  |==                                                               |   3%
  |                                                                       
  |==                                                               |   4%
  |                                                                       
  |===                                                              |   4%
  |                                                                       
  |===                                                              |   5%
  |                                                                       
  |====                                                             |   5%
  |                                                                       
  |====                                                             |   6%
  |                                                                       
  |====                                                             |   7%
  |                                                                       
  |=====                                                            |   7%
  |                                                                       
  |=====                                                            |   8%
  |                                                                       
  |======                                                           |   8%
  |                                                                       
  |======                                                           |   9%
  |                                                                       
  |======                                                           |  10%
  |                                                                       
  |=======                                                          |  10%
  |                                                                       
  |=======                                                          |  11%
  |                                                                       
  |=======                                                          |  12%
  |                                                                       
  |========                                                         |  12%
  |                                                                       
  |========                                                         |  13%
  |                                                                       
  |=========                                                        |  13%
  |                                                                       
  |=========                                                        |  14%
  |                                                                       
  |=========                                                        |  15%
  |                                                                       
  |==========                                                       |  15%
  |                                                                       
  |==========                                                       |  16%
  |                                                                       
  |===========                                                      |  16%
  |                                                                       
  |===========                                                      |  17%
  |                                                                       
  |===========                                                      |  18%
  |                                                                       
  |============                                                     |  18%
  |                                                                       
  |============                                                     |  19%
  |                                                                       
  |=============                                                    |  19%
  |                                                                       
  |=============                                                    |  20%
  |                                                                       
  |=============                                                    |  21%
  |                                                                       
  |==============                                                   |  21%
  |                                                                       
  |==============                                                   |  22%
  |                                                                       
  |===============                                                  |  22%
  |                                                                       
  |===============                                                  |  23%
  |                                                                       
  |===============                                                  |  24%
  |                                                                       
  |================                                                 |  24%
  |                                                                       
  |================                                                 |  25%
  |                                                                       
  |=================                                                |  25%
  |                                                                       
  |=================                                                |  26%
  |                                                                       
  |=================                                                |  27%
  |                                                                       
  |==================                                               |  27%
  |                                                                       
  |==================                                               |  28%
  |                                                                       
  |===================                                              |  28%
  |                                                                       
  |===================                                              |  29%
  |                                                                       
  |===================                                              |  30%
  |                                                                       
  |====================                                             |  30%
  |                                                                       
  |====================                                             |  31%
  |                                                                       
  |====================                                             |  32%
  |                                                                       
  |=====================                                            |  32%
  |                                                                       
  |=====================                                            |  33%
  |                                                                       
  |======================                                           |  33%
  |                                                                       
  |======================                                           |  34%
  |                                                                       
  |======================                                           |  35%
  |                                                                       
  |=======================                                          |  35%
  |                                                                       
  |=======================                                          |  36%
  |                                                                       
  |========================                                         |  36%
  |                                                                       
  |========================                                         |  37%
  |                                                                       
  |========================                                         |  38%
  |                                                                       
  |=========================                                        |  38%
  |                                                                       
  |=========================                                        |  39%
  |                                                                       
  |==========================                                       |  39%
  |                                                                       
  |==========================                                       |  40%
  |                                                                       
  |==========================                                       |  41%
  |                                                                       
  |===========================                                      |  41%
  |                                                                       
  |===========================                                      |  42%
  |                                                                       
  |============================                                     |  42%
  |                                                                       
  |============================                                     |  43%
  |                                                                       
  |============================                                     |  44%
  |                                                                       
  |=============================                                    |  44%
  |                                                                       
  |=============================                                    |  45%
  |                                                                       
  |==============================                                   |  45%
  |                                                                       
  |==============================                                   |  46%
  |                                                                       
  |==============================                                   |  47%
  |                                                                       
  |===============================                                  |  47%
  |                                                                       
  |===============================                                  |  48%
  |                                                                       
  |================================                                 |  48%
  |                                                                       
  |================================                                 |  49%
  |                                                                       
  |================================                                 |  50%
  |                                                                       
  |=================================                                |  50%
  |                                                                       
  |=================================                                |  51%
  |                                                                       
  |=================================                                |  52%
  |                                                                       
  |==================================                               |  52%
  |                                                                       
  |==================================                               |  53%
  |                                                                       
  |===================================                              |  53%
  |                                                                       
  |===================================                              |  54%
  |                                                                       
  |===================================                              |  55%
  |                                                                       
  |====================================                             |  55%
  |                                                                       
  |====================================                             |  56%
  |                                                                       
  |=====================================                            |  56%
  |                                                                       
  |=====================================                            |  57%
  |                                                                       
  |=====================================                            |  58%
  |                                                                       
  |======================================                           |  58%
  |                                                                       
  |======================================                           |  59%
  |                                                                       
  |=======================================                          |  59%
  |                                                                       
  |=======================================                          |  60%
  |                                                                       
  |=======================================                          |  61%
  |                                                                       
  |========================================                         |  61%
  |                                                                       
  |========================================                         |  62%
  |                                                                       
  |=========================================                        |  62%
  |                                                                       
  |=========================================                        |  63%
  |                                                                       
  |=========================================                        |  64%
  |                                                                       
  |==========================================                       |  64%
  |                                                                       
  |==========================================                       |  65%
  |                                                                       
  |===========================================                      |  65%
  |                                                                       
  |===========================================                      |  66%
  |                                                                       
  |===========================================                      |  67%
  |                                                                       
  |============================================                     |  67%
  |                                                                       
  |============================================                     |  68%
  |                                                                       
  |=============================================                    |  68%
  |                                                                       
  |=============================================                    |  69%
  |                                                                       
  |=============================================                    |  70%
  |                                                                       
  |==============================================                   |  70%
  |                                                                       
  |==============================================                   |  71%
  |                                                                       
  |==============================================                   |  72%
  |                                                                       
  |===============================================                  |  72%
  |                                                                       
  |===============================================                  |  73%
  |                                                                       
  |================================================                 |  73%
  |                                                                       
  |================================================                 |  74%
  |                                                                       
  |================================================                 |  75%
  |                                                                       
  |=================================================                |  75%
  |                                                                       
  |=================================================                |  76%
  |                                                                       
  |==================================================               |  76%
  |                                                                       
  |==================================================               |  77%
  |                                                                       
  |==================================================               |  78%
  |                                                                       
  |===================================================              |  78%
  |                                                                       
  |===================================================              |  79%
  |                                                                       
  |====================================================             |  79%
  |                                                                       
  |====================================================             |  80%
  |                                                                       
  |====================================================             |  81%
  |                                                                       
  |=====================================================            |  81%
  |                                                                       
  |=====================================================            |  82%
  |                                                                       
  |======================================================           |  82%
  |                                                                       
  |======================================================           |  83%
  |                                                                       
  |======================================================           |  84%
  |                                                                       
  |=======================================================          |  84%
  |                                                                       
  |=======================================================          |  85%
  |                                                                       
  |========================================================         |  85%
  |                                                                       
  |========================================================         |  86%
  |                                                                       
  |========================================================         |  87%
  |                                                                       
  |=========================================================        |  87%
  |                                                                       
  |=========================================================        |  88%
  |                                                                       
  |==========================================================       |  88%
  |                                                                       
  |==========================================================       |  89%
  |                                                                       
  |==========================================================       |  90%
  |                                                                       
  |===========================================================      |  90%
  |                                                                       
  |===========================================================      |  91%
  |                                                                       
  |===========================================================      |  92%
  |                                                                       
  |============================================================     |  92%
  |                                                                       
  |============================================================     |  93%
  |                                                                       
  |=============================================================    |  93%
  |                                                                       
  |=============================================================    |  94%
  |                                                                       
  |=============================================================    |  95%
  |                                                                       
  |==============================================================   |  95%
  |                                                                       
  |==============================================================   |  96%
  |                                                                       
  |===============================================================  |  96%
  |                                                                       
  |===============================================================  |  97%
  |                                                                       
  |===============================================================  |  98%
  |                                                                       
  |================================================================ |  98%
  |                                                                       
  |================================================================ |  99%
  |                                                                       
  |=================================================================|  99%
  |                                                                       
  |=================================================================| 100%
#As the indices of the nearest station in stn_sf are computed into a list, trim the list into single values for each property point
for (i in 1:nrow(Londonclean)) {
  Londonclean_sf$near_stn_index[i]<-sum(as.numeric(Londonclean_sf$near[[i]]))
}

#Trim the redundant station index column in list format
Londonclean$near_stn_index<-Londonclean_sf$near_stn_index
Londonclean_sf$near_stn_index<-NULL

#Drop redundant index column
Londonclean$X<-NULL
Londonclean$X.1<-NULL
Londonclean_sf$X<-NULL
Londonclean_sf$X.1<-NULL

#Set parameters to locate the desired columns
n=ncol(Londonclean)
m=grep("near_stn_index", colnames(Londonclean))

#Look up for the nearest station names and coordinates from stn_sf dataframe
start.time <- Sys.time()
######
for (i in 1:nrow(Londonclean)) {
  Londonclean[i,n+1]<-stn_sf[Londonclean$near_stn_index[i],1]
  Londonclean[i,n+2]<-st_coordinates(stn_sf[Londonclean[i,m],])[,2]
  Londonclean[i,n+3]<-st_coordinates(stn_sf[Londonclean[i,m],])[,1]
}
######
end.time <- Sys.time()
time.taken <- end.time - start.time
time.taken #1.854117mins to run
## Time difference of 1.598767 mins
#Ensure the header for the extended dataframe is correctly named
#Ensure the header for the extended dataframe is correctly named
names(Londonclean)<-c("price","postcode","prop_type","new_build","lease_type","town","district","county","lat","lon","GoogleAddress","new_build_ind","flat_ind","lease_ind","near_stn_index","near_stn_name","near_stn_lat","near_stn_lon")

#Below has taken the references from link to calculate the distance in meteres between two coordinates extracted: https://stackoverflow.com/questions/42119438/calculate-distance-between-two-long-lat-coordinates-in-a-dataframe

start.time <- Sys.time()
######
library(geosphere)
for (i in 1:nrow(Londonclean)) {

  a<-Londonclean$lon[i]
  b<-Londonclean$lat[i]
  c<-Londonclean$near_stn_lon[i]
  d<-Londonclean$near_stn_lat[i]

  Londonclean$near_stn_dist[i]<-distm(c(a,b),c(c,d), fun = distHaversine)
}
######
end.time <- Sys.time()
time.taken <- end.time - start.time
time.taken #5.5secs to run
## Time difference of 4.378713 secs
#Round the distance column to the nearest integer
Londonclean$near_stn_dist<-round(Londonclean$near_stn_dist)

#Save the cleaned price paid data with distance in metres
write.csv(Londonclean,"LondonppNov2018distance.csv")

#Plot the data point on map, where the size of symbols depend on the price paid for that property
tmap_mode("view")
tm_shape(Londonclean_sf)+tm_symbols(col="red",size="price",perceptual=TRUE,alpha=0.5,scale=0.4)+tm_shape(stn_sf)+tm_dots(col="black",size=0.01)
library(ggplot2)
library(geosphere)
library(tmap)
library(tidyverse)
library(dplyr)

#Read in prepared csv
setwd("D:/Xin/MSc/Modules/CASA0005_GISS/Assessment/Assessment Part 3/")
Londonclean<-read.csv("LondonppNov2018distance.csv",header=TRUE,sep=",")
Londonclean$X<-NULL

#Descriptive data
summary_descr<-summary(Londonclean)
write.csv(summary_descr,"summary_descriptive.csv")

#Plot price paid and distance to nearest station histograms
qplot(price,data=Londonclean,geom="histogram")

qplot(near_stn_dist,data=Londonclean,geom="histogram")

#Plot price against distance to station as a scatter plot to assess possibilities of simple linear regression
qplot(near_stn_dist,price,data=Londonclean,geom="point")+geom_smooth(method="lm",se=FALSE,size=1,col="red")

#Now, test simple linear regression model
sl_near_stn_dist<-lm(price~near_stn_dist,data=Londonclean)
plot(sl_near_stn_dist)

sink("sl_near_stn_dist.txt")
summary(sl_near_stn_dist)
## 
## Call:
## lm(formula = price ~ near_stn_dist, data = Londonclean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -432.83 -134.30  -34.86  105.26  625.18 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   506.786538   3.495893  144.97   <2e-16 ***
## near_stn_dist  -0.042579   0.003842  -11.08   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 190.1 on 8202 degrees of freedom
## Multiple R-squared:  0.01475,    Adjusted R-squared:  0.01463 
## F-statistic: 122.8 on 1 and 8202 DF,  p-value: < 2.2e-16
sink()

#Test simple lm for new build
sl_near_stn_dist_slice_new_build<-lm(price~near_stn_dist,data=subset(Londonclean,new_build_ind==1))
plot(sl_near_stn_dist_slice_new_build)

sink("sl_near_stn_dist_slice_new_build.txt")
summary(sl_near_stn_dist_slice_new_build)
## 
## Call:
## lm(formula = price ~ near_stn_dist, data = subset(Londonclean, 
##     new_build_ind == 1))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -389.57 -125.76  -34.71  101.79  561.22 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   545.89244    7.44101  73.363   <2e-16 ***
## near_stn_dist  -0.07363    0.00871  -8.454   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 167 on 1295 degrees of freedom
## Multiple R-squared:  0.0523, Adjusted R-squared:  0.05157 
## F-statistic: 71.47 on 1 and 1295 DF,  p-value: < 2.2e-16
sink()

#Test simple lm for flat
sl_near_stn_dist_slice_flat<-lm(price~near_stn_dist,data=subset(Londonclean,flat_ind==1))
plot(sl_near_stn_dist_slice_flat)

sink("sl_near_stn_dist_slice_flat.txt")
summary(sl_near_stn_dist_slice_flat)
## 
## Call:
## lm(formula = price ~ near_stn_dist, data = subset(Londonclean, 
##     flat_ind == 1))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -380.38 -117.82  -29.04   90.18  681.86 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   479.078892   4.295352  111.53   <2e-16 ***
## near_stn_dist  -0.080356   0.005672  -14.17   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 176 on 4713 degrees of freedom
## Multiple R-squared:  0.04084,    Adjusted R-squared:  0.04064 
## F-statistic: 200.7 on 1 and 4713 DF,  p-value: < 2.2e-16
sink()

#Test simple lm for leasehold
sl_near_stn_dist_slice_lease<-lm(price~near_stn_dist,data=subset(Londonclean,lease_ind==1))
plot(sl_near_stn_dist_slice_lease)

sink("sl_near_stn_dist_slice_lease.txt")
summary(sl_near_stn_dist_slice_lease)
## 
## Call:
## lm(formula = price ~ near_stn_dist, data = subset(Londonclean, 
##     lease_ind == 1))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -380.65 -118.30  -29.49   90.01  679.54 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   479.278652   4.268139  112.29   <2e-16 ***
## near_stn_dist  -0.078950   0.005617  -14.06   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 176.8 on 4809 degrees of freedom
## Multiple R-squared:  0.03946,    Adjusted R-squared:  0.03926 
## F-statistic: 197.6 on 1 and 4809 DF,  p-value: < 2.2e-16
sink()

#Test simple lm for Camden
sl_near_stn_dist_slice_camden<-lm(price~near_stn_dist,data=subset(Londonclean,district=="CAMDEN"))
plot(sl_near_stn_dist_slice_camden)

sink("sl_near_stn_dist_slice_camden.txt")
summary(sl_near_stn_dist_slice_camden)
## 
## Call:
## lm(formula = price ~ near_stn_dist, data = subset(Londonclean, 
##     district == "CAMDEN"))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -441.38 -159.52    8.86  128.91  495.25 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   522.33537   36.29286  14.392  < 2e-16 ***
## near_stn_dist   0.22207    0.08278   2.683  0.00843 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 202.1 on 110 degrees of freedom
## Multiple R-squared:  0.0614, Adjusted R-squared:  0.05287 
## F-statistic: 7.196 on 1 and 110 DF,  p-value: 0.008432
sink()

#To add distance to centre variable, pick Piccadilly Circus as the centre for London

start.time <- Sys.time()
#######
for (i in 1:nrow(Londonclean)) {

  a<-st_coordinates(subset(stn,Name=="Piccadilly Circus"))[,1]
  b<-st_coordinates(subset(stn,Name=="Piccadilly Circus"))[,2]
  c<-Londonclean$near_stn_lon[i]
  d<-Londonclean$near_stn_lat[i]

  Londonclean$centre_dist[i]<-round(distm(c(a,b),c(c,d), fun = distHaversine))
}
######
end.time <- Sys.time()
time.taken <- end.time - start.time
time.taken #1.413502mins to run
## Time difference of 1.423664 mins
#Save final version of study data
write.csv(Londonclean,"LondonppNov2018final.csv")

#Test simple lm for distance to centre
sl_centre_dist<-lm(price~centre_dist,data=Londonclean)
sink("sl_centre_dist.txt")
summary(sl_centre_dist)
## 
## Call:
## lm(formula = price ~ centre_dist, data = Londonclean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -460.99 -127.25  -33.21  103.27  722.14 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.928e+02  5.005e+00  118.44   <2e-16 ***
## centre_dist -9.033e-03  3.532e-04  -25.58   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 184.3 on 8202 degrees of freedom
## Multiple R-squared:  0.07387,    Adjusted R-squared:  0.07376 
## F-statistic: 654.2 on 1 and 8202 DF,  p-value: < 2.2e-16
sink()


#Test multiple regresssion for all explanatory variables
ml_all<-lm(price~near_stn_dist+centre_dist+flat_ind+lease_ind+new_build_ind,data=Londonclean)
#plot(ml_all)
sink("ml_all.txt")
summary(ml_all)
## 
## Call:
## lm(formula = price ~ near_stn_dist + centre_dist + flat_ind + 
##     lease_ind + new_build_ind, data = Londonclean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -523.79 -106.47  -19.00   86.45  706.13 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    8.022e+02  6.196e+00 129.483  < 2e-16 ***
## near_stn_dist -2.201e-02  3.587e-03  -6.135 8.89e-10 ***
## centre_dist   -1.547e-02  3.636e-04 -42.556  < 2e-16 ***
## flat_ind      -1.007e+02  1.398e+01  -7.199 6.59e-13 ***
## lease_ind     -1.135e+02  1.403e+01  -8.093 6.65e-16 ***
## new_build_ind  9.093e+01  5.117e+00  17.770  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 160.8 on 8198 degrees of freedom
## Multiple R-squared:  0.2954, Adjusted R-squared:  0.2949 
## F-statistic: 687.3 on 5 and 8198 DF,  p-value: < 2.2e-16
sink()

#######################

#Trim LondonMap name to append price data calculated later
LondonMap$lad15nm<-gsub(" ","_",toupper(unique(LondonMap$lad15nm)))

#Update the average price
London_avg_price=ave(Londonclean$price)[1]
London_avg_price
## [1] 475.8
#update average price by borough
pricebyBorough<-aggregate(Londonclean$price, by=list(borough=Londonclean$district), FUN=mean)
pricebyBorough$x<-round(pricebyBorough$x)
names(pricebyBorough)=c("borough","price_per_property")

#calculate counts of transactions by borough
#countsbyBorough<-Londonclean %>% 
#    as.tibble() %>% 
#     count(district)
#countsbyBorough

#Append the updated price by borough to the main dataframe
#Londonclean$pricebyBorough <- pricebyBorough$price_per_property[match(Londonclean$district,pricebyBorough$borough)]

#sf object for the main dataframe to be updated
#Convert sp objects into simple features
#rm(LondonMap_sf)
#LondonMap_sf<-st_as_sf(LondonMap)

#Convert the crs to EPSG:4326 and check
#LondonMap_sf<-st_transform(LondonMap_sf,latlon)

#Append the updated price per borough to London borough boundaries. First make sure the field values are consistent
pricebyBorough$borough<-sub("CITY_OF_WESTMINSTER","WESTMINSTER",pricebyBorough$borough)
LondonMap_sf$lad15nm<-toupper(LondonMap_sf$lad15nm)
LondonMap_sf$lad15nm<-gsub(" ","_",LondonMap_sf$lad15nm)
LondonMap_sf$pricebyBorough <- pricebyBorough$price_per_property[match(LondonMap_sf$lad15nm,pricebyBorough$borough)]
#######################
#Plot the price by borough as choropleth map
tmap_mode("view")
tm_shape(LondonMap_sf)+tm_polygons("pricebyBorough",style="jenks",title="Average Price per Property by Borough,£k")+tm_text("lad15nm",size=0.6)+tm_scale_bar(position = c("left", "bottom"))
write.csv(Londonclean,"LondonppNov2018final.csv")